This class is rather elementary, therefore there won’t be a whole lot of explanation here. The first week provides an overview of all of the courses in the Data Science specialization. The second week provides an introduction to using the command line, github, markdown, and installing packages in R. The third week gives an overview of what data science is, what data is, and briefly discusses experimental design.
I found this class to be extremely useful. It deals with the ins and outs of obtaining and cleaning data from many sources. The techniques discussed here are must-have’s for analyzing data in R.
# Melt data: wide -> long, keeping the id variables and stacking the
# measure variables into (variable, value) pairs (reshape2::melt).
mtcars$carname <- rownames(mtcars)
carMelt <- melt(mtcars, id = c("carname", "gear", "cyl"), measure.vars = c("mpg", "hp"))
# Cast data using dcast() (left of tilde for rows, right of tilde for
# columns), aggregating duplicate cells with mean().
cylData <- dcast(carMelt, cyl ~ variable, mean)
# Sum values per factor level (split/apply/combine)
library(plyr)
# fixed: column-name typo "countCOl" -> "countCol"
ddply(df, .(factor), summarize, sum = sum(countCol))
# merge data frames
merge(df1, df2, by.x = "df1Col", by.y = "df2Col", all = TRUE)
# merge using plyr by common id
library(plyr)
arrange(join(df1, df2), id)
# merge multiple dataframes using list
dfList <- list(df1, df2, df3)
join_all(dfList)
# load large data: read a small sample first, detect each column's class,
# then re-read the full file with explicit colClasses so read.table does
# not have to guess types for every row (faster, less memory)
initial <- read.table("datafile.txt", nrows = 100)
classes <- sapply(initial, class)
df <- read.table("datafile.txt", colClasses = classes)
# check for subdirectory within working directory
# and create it if it does not exist
if (!file.exists("data")) {
dir.create("data")
}
# read .xlsx file
library(xlsx)
df <- read.xlsx("path/to/file.xlsx", sheetIndex = 1, header = TRUE)
# download file from internet
fileUrl <- ""
download.file(fileUrl, destfile = "./data/filename.csv", method = "curl")
list.files("./data")
# Get date
dateDownloaded <- date()
dateDownloaded
# download xml file from internet
library(XML)
fileUrl <- ""
doc <- xmlTreeParse(fileUrl, useInternal = TRUE)
rootNode <- xmlRoot(doc)
xmlName(rootNode)
names(rootNode)
# download JSON data from internet
library(jsonlite)
jsonData <- fromJSON("url")
names(jsonData)
# MySQL: connect to the public UCSC genome server and list its databases
library(RMySQL)
ucscDb <- dbConnect(MySQL(), user = "genome",
host = "genome-mysql.cse.ucsc.edu")
# fixed: object is "ucscDb" but was queried as "ucscDB" (undefined name)
result <- dbGetQuery(ucscDb, "show databases;"); dbDisconnect(ucscDb);
# ex: connect to one specific database (hg19) on the UCSC server
hg19 <- dbConnect(MySQL(), user = "genome", db = "hg19",
host = "genome-mysql.cse.ucsc.edu")  # fixed: host had "," instead of "."
allTables <- dbListTables(hg19)
length(allTables)
allTables[1:5]
dbListFields(hg19, "affyU133Plus2")
# fixed: stray "a" removed from the SQL ("from a affyU133Plus2")
dbGetQuery(hg19, "select count(*) from affyU133Plus2")
# read from the table
affyData <- dbReadTable(hg19, "affyU133Plus2")
head(affyData)  # fixed: was head(affyU133Plus2), an undefined object
# select a specific subset via a server-side query
query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
affyMis <- fetch(query); quantile(affyMis$misMatches)
affyMisSmall <- fetch(query, n=10); dbClearResult(query);
dim(affyMisSmall)
# always disconnect when done
dbDisconnect(hg19)
# hdf5 data (Bioconductor "rhdf5" package)
source("http://bioconductor.org/biocLite.R")
biocLite("rhdf5")
library(rhdf5)
# fixed: result was assigned to "create" but inspected as "created"
created = h5createFile("example.h5")
created
# create groups (fixed: filename typo "exmaple.h5" -> "example.h5", x3)
created = h5createGroup("example.h5", "foo")
created = h5createGroup("example.h5", "baa")
created = h5createGroup("example.h5", "foo/foobaa")
# look at it
h5ls("example.h5")
# write to groups
A = matrix(1:10, nr=5, nc=2)
h5write(A, "example.h5", "foo/A")
# fixed: malformed seq() call; dim = c(5, 2, 2) needs exactly 20 values
B = array(seq(0.1, 2.0, by=0.1), dim=c(5,2,2))
attr(B, "scale") <- "liter"
h5write(B, "example.h5", "foo/foobaa/B")
h5ls("example.h5")
# write a dataset (whole data frame)
df = data.frame(1L:5L, seq(0,1, length.out=5),
c("ab", "cde", "fghi", "a", "s"), stringsAsFactors=FALSE)
h5write(df, "example.h5", "df")
h5ls("example.h5")
# reading data back out
readA = h5read("example.h5", "foo/A")
readB = h5read("example.h5", "foo/foobaa/B")
readdf= h5read("example.h5", "df")
readA
# writing and reading chunks (index = rows 1:3, column 1)
h5write(c(12,13,14), "example.h5", "foo/A", index=list(1:3,1))
h5read("example.h5", "foo/A")
# web scraping: read raw HTML lines from a URL connection
con = url("http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en")
htmlCode = readLines(con)
# close the connection as soon as the read is done
close(con)
htmlCode
# 1
library(XML)
url <- "http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en"
html <- htmlTreeParse(url, useInternalNodes=T)
xpathSApply(html, "//title", xmlValue)
xpathSApply(html, "//td[@id='col-citedby']", xmlValue)
# 2
library(httr); html2 = GET(url)
content2 = content(html2, as="text")
parsedHtml = htmlParse(content2, asText=TRUE)
xpathSApply(parsedHtml, "//title", xmlValue)
# if a webpage requires a password, authenticate with httr
# fixed: URL was missing its scheme ("http://")
pg2 = GET("http://httpbin.org/basic-auth/user/passwd",
authenticate("user", "passwd"))
pg2
# twitter
library(httr)
myapp = oauth_app("twitter",
key = "yourCOnsumerKeyHere", secret = "yourConsumerSecretHere")
sig = sign_oauth1.0(myapp,
token = "yourTokenHere",
token_secret = "yourTokenSecretHere")
homeTL = GET("https://api.twitter.com/1.1/statuses/home_timeline.json", sig)
# converting the json object
json1 = content(homeTL)
json2 = jsonlite::fromJSON(toJSON(json1))
json2[1,1:4]
# Github
library(httr)
# 1. Find OAuth settings for github:
# http://developer.github.com/v3/oauth/
oauth_endpoints("github")
# 2. Register an application at https://github.com/settings/applications
# Insert your values below - if secret is omitted, it will look it up in
# the GITHUB_CONSUMER_SECRET environmental variable.
#
# Use http://localhost:1410 as the callback url
myapp <- oauth_app("github", "2bc549002113db38fcde")
# 3. Get OAuth credentials
github_token <- oauth2.0_token(oauth_endpoints("github"), myapp)
# 4. Use API
req <- GET("https://api.github.com/rate_limit", config(token = github_token))
stop_for_status(req)
content(req)
# change case
tolower()
toupper()
# split strings
strsplit(x, "\\.")
# strip out first element
# Return just the first element of x (use with sapply over a collection).
firstElement <- function(x) {
  head(x, 1)
}
sapply(vector, firstElement)
# substitute characters from vector
sub("oldchar", "newchar", vector)
# substitute characters from vector
gsub("oldchar", "newchar", vector)
# find indices of matching values
grep("searchstring", df$col)
# find matches, returning the matching values themselves
grep("searchstring", df$col, value = TRUE)
# find out if the value occurs at all (0 means no matches)
length(grep("searchstring", df$col))
# tabulate the logical match vector (TRUE/FALSE counts)
table(grepl("searchstring", df$col))
# subset: keep only rows whose col does NOT match
newdf <- olddf[!grepl("searchstring", olddf$col),]
# find # of chars
library(stringr)
nchar("Joseph Casillas")
## [1] 15
# select specific chars (1st-6th char)
substr("Joseph Casillas", 1, 6)
## [1] "Joseph"
# paste elements together
paste("Joseph", "Casillas")
## [1] "Joseph Casillas"
# paste elements together with no space
paste0("Joseph", "Casillas")
## [1] "JosephCasillas"
# remove trailing white space
str_trim("Joseph ")
## [1] "Joseph"
^ matches at the start of a line
^i think$ matches the end of a line
morning$[] (character class) matches either/or inside of the brackets
[Bb][Uu][Ss][Hh] matches any iteration of bush/Bush/bUsH/etc.[]
^[0-9][a-zA-Z] matches any number followed by any letter at the beginning of a line^ inside of a character class ([]) will not match the characters in the class
[^?.]$ will match any line that doesn’t end with a ? or a .. matches any character
9.11 will match a 9, followed by anything, followed by 11| combines two expressions. It translates to “or”
flood|fire will match “flood” or “fire”^[Gg]ood|[Bb]ad will match “good” (upper or lower case) at the beginning of a line or “bad” anywhere^([Gg]ood|[Bb]ad) same as above, but both alternatives are restricted to beginning of the line? directly after () indicates that the enclosed expression is opitonal
[Gg]eorge( [Ww]\.)? [Bb]ush matches “George Bush” and “George W. Bush” (lower or uppercase)* matches whatever is directly before it any number of times (including none). * is greedy (it looks for the max), but can be made ‘less greedy’ by following it with a ? (i.e. ^s(.*?)s$)
(.*) will match anything between partenthesis… “(hello how are you)” and “()”+ matches whatever is directly before it one or more times (not including none)
[0-9]+ (.*)[0-9]+ will match at least one number followed by any number of characters followed by at least one number. {} are interval quantifiers that specify the minimum and maximum number of matches of an expression. This can be {x,y} (at least x, but not more than y), {x,} (at least x matches), or {x} (exactly x matches)
[Bb]ush( +[^ ]+ +){1,5} debate will match “Bush” and “debate” with between one to five words in between them() are scope limiters. Whatever is matched between them can be repeated using escaped numbers, i.e. \1
+([a-zA-Z]+) +\1 + will match repeated words followed and preceeded by a space (i.e. “night night”, “so so”, etc.)%d = day as number (0-31)%a = abbreviated weekday%A = unabbreviated weekday%m = month (00-12)%b = abbreviated month%B = unabbreviated month%y = 2 digit year%Y = 4 digit yeard2 <- Sys.date returns “2014-01-12”. If we run format(d2 "%a %b %d") it outputs as “Sun Jan 12”as.Date(x, "")
x <- c("1jan1960", "2jan1960"); z <- as.Date(x, "%d%b%Y")[1] "1960-01-01" "1960-01-02"z[2] - z[1] returns Time difference of 1 days. This can also be returned as a numeric vector: as.numeric(z[2] - z[1])library(lubridate) makes working with dates easier
twitteR packagerfigsharerplosRFacebookRGoogleMaps?connections
read.XXX
Snippets and functions that are useful for initially setting up the R workspace.
# cleanup global environment
rm(list = ls(all = TRUE))
General purpose gems that everybody should know
# make table of dataset
table(df$col, useNA="ifany")
# xtabs
xt <- xtabs(Freq ~ var1 + var2, data = df)
# check for NAs (other useful commands... any(), all(),)
sum(is.na(df$col))
# row and column sums
colSums(is.na(df))
rowSums(is.na(df))
# size of dataset
print(object.size(df), units = "Mb")
# creating sequences
seq(1, 10, by = 2)      # by step size: 1 3 5 7 9
seq(1, 10, length = 3)  # fixed total length: 1.0 5.5 10.0
seq_along(x)            # preferred over seq(along = x); also safe for length-0 x
# generate levels of a factor (2 levels, each repeated 8 times)
gl(2, 8, labels = c("Control", "Treat"))
Subsetting in many different ways.
# find out if there are values w/ specific characteristics
table(df$col %in% c("value1", "value2"))
# subset dataframe based on special characteristics specified above
df[df$col %in% c("value1", "value2"),]
# subset using % % (returns T/F)
df$newcol <- df$someCol %in% c("level1", "level2")
# create binary variables (if condition is true, add true in new col)
df$newcol <- ifelse(df$someCol < 0, TRUE, FALSE)
# 'cut' - create categorical (factor) variable from continuous variable
df$newCol <- cut(df$someCol, breaks = quantile(df$someCol))
# 'cut' with plyr
library(plyr)
newDF <- mutate(oldDF, newCol = cut2(oldCol, g=4))
General programming basics. Useful for all languages.
Mainly for writing programs. When using CLI, better to use *apply.
# ex 1
if(<condition>) {
## do something
} else {
## do something else
}
# ex 2
if(<condition>) {
## do something
} else if(<condition>) {
## do something different
} else {
## do something different
}
x <- c("a", "b", "c", "d")
# Ex 1
for(i in 1:4) {
print(x[i])
}
## [1] "a"
## [1] "b"
## [1] "c"
## [1] "d"
# Ex 2
for(i in seq_along(x)) {
print(x[i])
}
## [1] "a"
## [1] "b"
## [1] "c"
## [1] "d"
# Ex 3
for(letter in x) {
print(letter)
}
## [1] "a"
## [1] "b"
## [1] "c"
## [1] "d"
# Ex 4
for(i in 1:4) print(x[i])
## [1] "a"
## [1] "b"
## [1] "c"
## [1] "d"
x <- matrix(1:6, 2, 3)
for(i in seq_len(nrow(x))) {
for(j in seq_len(ncol(x))) {
print(x[i, j])
}
}
## [1] 1
## [1] 3
## [1] 5
## [1] 2
## [1] 4
## [1] 6
# Ex 1
count <- 0
while(count <= 10) {
print(count)
count <- count +1
}
## [1] 0
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
# Ex 2
z <- 5
while(z >= 3 && z <= 10) {
print(z)
coin <- rbinom(1, 1, 0.5)
if(coin == 1) { ## random walk
z <- z + 1
} else {
z <- z - 1
}
}
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 9
## [1] 10
x0 <- 1
tol <- 1e-8
repeat {
x1 <- computeEstimate()
if(abs(x1 - x0) < tol) {
break
} else {
x0 <- x1
}
}
for(i in 1:100) {
if(i <= 20) {
## skip the first 20 iterations
next
}
## do something here
}
f <- function(<arguments>) {
## do something interesting
}
NULL. Functions use lazy evaluation. This means that if an argument is stipulated in the function, but not included in the function, it will still work correctly.f <- function(a, b) {
a^2
}
f(2)
## [1] 4
This will return 4. 2 is positionally matched to a and b is never evaluated.
f <- function(a, b) {
print(a)
print(b)
}
f(45)
45 is positionally matched to a, but not to b. So when R evaluates b it gives an error because of the missing argument, after it has already evaluated a. The ... argument is good for cases when you are extending the functionality of an already established function:
myplot <- function(x, y, type = "l", ...) {
  plot(x, y, type = type, ...)
}
This allows myplot() to default to type = "l" (a line plot) and use all of the other default arguments of the original plot() function. Generic functions also use ..., but this hasn't been explained yet. It is also used in functions in which the number of necessary arguments cannot be known in advance. args(paste)
function(..., sep = " ", collapse = NULL)
... have to be named explicitly. If not, R will ignore partial matching or give an error.add2 <- function(x, y) {
x + y
}
above10 <- function(x) {
use <- x > 10
x[use]
}
above <- function(x, n = 10) {
use <- x > n
x[use]
}
n). It will default to 10 if nothing is specified. x <- 1:20
above(x, 12)
## [1] 13 14 15 16 17 18 19 20
columnmean <- function(x, removeNA = TRUE) {
nc <- ncol(x) # get the number of cols of df/matrix
means <- numeric(nc) # initialize vector that will store means of each column
for(i in 1:nc) {
means[i] <- mean(x[, i], na.rm = removeNA)
}
means
}
...Ex.
x <- list(a = 1:5, b = rnorm(10))
lapply(x, mean)
## $a
## [1] 3
##
## $b
## [1] 0.2248305
Ex.
x <- list(a = 1:4, b = rnorm(10), c = rnorm(20, 1), d = rnorm(100, 5))
lapply(x, mean) # returns list
## $a
## [1] 2.5
##
## $b
## [1] -0.147946
##
## $c
## [1] 1.069424
##
## $d
## [1] 5.004428
sapply(x, mean) # returns vector
## a b c d
## 2.500000 -0.147946 1.069424 5.004428
Ex.
# extract the first column of each (matrix) element of x
# fixed: "fucntion" -> "function" (original would not parse)
lapply(x, function(elt) elt[, 1])
x is an arrayMARGIN is an integer vector indicating which margins should be retained
1 = rows2 = columnsFUN is a functioni to be applied... is for other arguements to be passed to FUNEx.
x <- matrix(rnorm(200), 20, 10)
apply(x, 1, mean)
## [1] -0.14098352 0.51267792 -0.49223090 -0.30394071 -0.14761763
## [6] -0.10044586 -0.45017446 0.44632827 0.30135584 -0.09231023
## [11] -0.22277694 0.38825978 -0.29881366 0.16848697 -0.17860254
## [16] 0.01348669 -0.55289394 0.62264736 0.16391661 0.02703794
apply(x, 2, mean)
## [1] 0.25339701 -0.23285265 -0.26559945 -0.09360610 0.18978693
## [6] -0.13164218 0.30890627 -0.19613112 0.04910631 -0.04966152
x is a vectorINDEX is a factor or a list of factorsFUN is a function to be applied... contains other arguements to be passed to FUNsimplify is a logical to simplify result or not (if FALSE, you get a list)Ex.
vot <- rnorm(20, 2, 5)
group <- gl(2, 10, 20, labels = c("en", "sp"))
treatment <- gl(4, 1, 20, labels = c("beg", "int", "med", "adv"))
df <- as.data.frame(cbind(group, treatment, vot))
tapply(df$vot, df$group, mean)
## 1 2
## 3.579214 5.344494
tapply(df$vot, list(df$group, df$treatment), mean)
## 1 2 3 4
## 1 4.502868 1.558790 8.270788 0.5327972
## 2 -2.235634 4.044246 10.805307 5.8039332
x is a vector (or list) or dff is a factor (or coerced to one) or a list of factorsdrop indicates whether empty factors levels should be droppedlapply() i.e. lapply(split(x, f), mean), but this is pretty much the same as tapply()FUN is a function to apply... contains arguments to apply overMoreArgs is a list of other arguments to FUNSIMPLIFY logical that indicate if the result should be simplified or notEx.
list(rep(1, 4), rep(2, 3), rep(3, 2), rep(4, 1))
## [[1]]
## [1] 1 1 1 1
##
## [[2]]
## [1] 2 2 2
##
## [[3]]
## [1] 3 3
##
## [[4]]
## [1] 4
mapply(rep, 1:4, 4:1)
## [[1]]
## [1] 1 1 1 1
##
## [[2]]
## [1] 2 2 2
##
## [[3]]
## [1] 3 3
##
## [[4]]
## [1] 4
There are three types of condition, a generic concept for indicating that something unexpected can occur, this can be created by the programmer
message is a generic notification, very tame — the function is still executed. warning indicates that something went wrong, but not a failure — the function is still executed. error indicates that a fatal problem occurred — execution stops. traceback prints out the function call stack after an error occurs; it does nothing if there's no error. debug flags a function for 'debug' mode, which allows you to step through execution of a function one line at a time. browser suspends the execution of a function wherever it is called and puts the function in debug mode. trace allows you to insert debugging code into a function at specific places. recover allows you to modify the error behavior so that you can browse the function call stack. q = quantile
rpois(): generate random Poisson variates (count data)
Ex. normal dist.
# d = density, p = cumulative distribution, q = quantile, r = random draws
dnorm(x, mean = 0, sd = 1, log = FALSE)
# fixed: pnorm() takes a quantile q and returns a probability
pnorm(q, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
# fixed: qnorm() takes a probability p and returns a quantile
qnorm(p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
rnorm(n, mean = 0, sd = 1)
set.seed(20)
x <- rnorm(100)
e <- rnorm(100, 0, 2)
y <- 0.5 + 2 * x + e
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.4080 -1.5400 0.6789 0.6893 2.9300 6.5050
plot(x, y)
set.seed(10)
x <- rbinom(100, 1, 0.5)
e <- rnorm(100, 0, 2)
y <- 0.5 + 2 * x + e
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.4940 -0.1409 1.5770 1.4320 2.8400 6.9410
plot(x, y)
set.seed(1)
x <- rnorm(100)
log.mu <- 0.5 + 0.3 * x
y <- rpois(100, exp(log.mu))
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.00 1.00 1.55 2.00 6.00
plot(x, y)
sample(1:10, 4)replace = TRUE so that numbers from sample can be repeatedsummaryRprof() will summarize the output of Rprof()system.time()Ex.
set.seed(1); normals <- rnorm(100, 1, 2)
nLL <- make.NegLogLik(normals)
nLL
# Factory returning the negative log-likelihood function of a Normal model
# for `data`. Entries of `fixed` that are not FALSE hold that parameter
# (mu, sigma) constant; the free parameters are filled in from the argument
# of the returned closure.
make.NegLogLik <- function(data, fixed = c(FALSE, FALSE)) {
  params <- fixed
  function(p) {
    params[!fixed] <- p
    mu <- params[1]
    sigma <- params[2]
    n <- length(data)
    normalizing <- -0.5 * n * log(2 * pi * sigma^2)
    quadratic <- -0.5 * sum((data - mu)^2) / sigma^2
    # negate: optimizers minimize, so return -log-likelihood
    -(normalizing + quadratic)
  }
}
# optimize both parameters at once
optim(c(mu = 0, sigma = 1), nLL)$par
# hold sigma = 2 fixed and estimate mu only
# fixed: optimize() was called on the old "nLL" instead of the new "nll"
nll <- make.NegLogLik(normals, c(FALSE, 2))
optimize(nll, c(-1, 3))$minimum
# hold mu = 1 fixed and estimate sigma only
nll <- make.NegLogLik(normals, c(1, FALSE))
optimize(nll, c(1e-6, 10))$minimum
# one-dimensional summaries of the pollution data
# fixed: column-name typo "mp25" -> "pm25"
boxplot(pollution$pm25, col = "blue")
abline(h = 12)  # reference line at the national standard
hist(pollution$pm25, col = "green", breaks = 100)
rug(pollution$pm25)  # fixed: missing closing parenthesis
abline(v = 12, lwd = 2)
abline(v = median(pollution$pm25), col = "magenta", lwd = 4)
barplot(table(pollution$region), col = "wheat", main = "Number of counties in each region")
# two dimensions: latitude vs pm25
with(pollution, plot(latitude, pm25))
abline(h = 12, lwd = 2, lty = 2)
# side-by-side panels for west/east regions
par(mfrow = c(1, 2), mar = c(5, 4, 2, 1))
with(subset(pollution, region == "west"), plot(latitude, pm25, main = "West"))
with(subset(pollution, region == "east"), plot(latitude, pm25, main = "East"))
Ex.
library(datasets)
data(cars)
with(cars, plot(speed, dist))
Ex.
library(datasets)
hist(airquality$Ozone)
with(airquality, plot(Wind, Ozone))
airquality <- transform(airquality, Month = factor(Month))
boxplot(Ozone ~ Month, airquality, xlab = "Month", ylab = "Ozone (ppb)")
Ex. Base plot with annotation
library(datasets)
with(airquality, plot(Wind, Ozone))
title(main = "Ozone and Wind in New York City")
Ex. Base plot with subset and coloring
library(datasets)
with(airquality, plot(Wind, Ozone, main = "Ozone and Wind in New York City"))
with(subset(airquality, Month == 5), points(Wind, Ozone, col = "blue"))
Ex. Base plot with more subsetting and coloring
library(datasets)
with(airquality, plot(Wind, Ozone, main = "Ozone and Wind in New York City", type = "n"))
with(subset(airquality, Month == 5), points(Wind, Ozone, col = "blue"))
with(subset(airquality, Month != 5), points(Wind, Ozone, col = "red"))
legend("topright", pch = 1, col = c("blue", "red"), legend = c("May", "Other months"))
Ex. Base plot with regression line
library(datasets)
with(airquality, plot(Wind, Ozone, main = "Ozone and Wind in New York City", pch = 20))
model <- lm(Ozone ~ Wind, data = airquality)
abline(model, lwd = 2)
Ex. Multiple base plots
library(datasets)
model1 <- lm(Ozone ~ Wind, data = airquality)
model2 <- lm(Ozone ~ Solar.R, data = airquality)
par(mfrow = c(2, 1))
with(airquality, {
plot(Wind, Ozone, main = "Ozone and Wind")
abline(model1, lwd = 2)
plot(Solar.R, Ozone, main = "Ozone and Solar Radiation")
abline(model2, lwd = 2)
})
library(datasets)
model1 <- lm(Ozone ~ Wind, data = airquality)
model2 <- lm(Ozone ~ Solar.R, data = airquality)
model3 <- lm(Ozone ~ Temp, data = airquality)
par(mfrow = c(1, 3), mar = c(4, 4, 2, 1), oma = c(0, 0, 2, 0))
with(airquality, {
plot(Wind, Ozone, main = "Ozone and Wind")
abline(model1, lwd = 2)
plot(Solar.R, Ozone, main = "Ozone and Solar Radiation")
abline(model2, lwd = 2)
plot(Temp, Ozone, main = "Ozone and Temperature")
abline(model3, lwd = 2)
mtext("Ozone and Weather in NYC", outer = TRUE)
})
Ex. Testing it out on my own…
x <- rnorm(100)
y <- x + rnorm(100)/3
g <- gl(2, 50, labels = c("Male", "Female"))
fit <- lm(y ~ x)
plot(x, y, type = "n")
points(x[g == "Male"], y[g == "Male"], col = "blue")
points(x[g == "Female"], y[g == "Female"], col = "red")
abline(fit, lwd = 2, col = "green")
legend("topleft", legend = c("Male", "Female"), pch = 1, col = c("blue", "red"))
title("Plot title")
Ex. code
library(datasets)
with(faithful, plot(eruptions, waiting)) # create plot
dev.copy(png, file = "testplot.png") # copy plot in screen device to a png file
dev.off() # close png device
p <-) and then printing said object (i.e. print(p))xyplot(y ~ x | f * g, data = )
Ex. Basic scatterplot
library(lattice)
library(datasets)
xyplot(Ozone ~ Wind, data = airquality)
Ex. Scatterplot at distinct levels
library(datasets)
library(lattice)
airquality <- transform(airquality, Month = factor(Month))
xyplot(Ozone ~ Wind | Month, data = airquality, layout = c(5, 1))
Ex.
library(lattice)
state <- data.frame(state.x77, region = state.region)
xyplot(Life.Exp ~ Income | region, data = state, layout = c(4,1))
set.seed(10)
x <- rnorm(100)
f <- rep(0:1, each = 50)
y <- x + f - f * x + rnorm(100, sd = 0.5)
f <- factor(f, labels = c("Group 1", "Group 2"))
xyplot(y ~ x | f, layout = c(2, 1))
Ex. w/ custom panel function (median line)
set.seed(10)
x <- rnorm(100)
f <- rep(0:1, each = 50)
y <- x + f - f * x + rnorm(100, sd = 0.5)
f <- factor(f, labels = c("Group 1", "Group 2"))
xyplot(y ~ x | f, panel = function(x, y, ...) {
panel.xyplot(x, y, ...) # call default panel function to plot points
panel.abline(h = median(y), lty = 2) # add horizontal line at median
})
Ex. w/ custom panel function (regression line)
set.seed(10)
x <- rnorm(100)
f <- rep(0:1, each = 50)
y <- x + f - f * x + rnorm(100, sd = 0.5)
f <- factor(f, labels = c("Group 1", "Group 2"))
xyplot(y ~ x | f, panel = function(x, y, ...) {
panel.xyplot(x, y, ...) # call default panel function to plot points
panel.lmline(x, y, col = 2) # add simple lm line
})
qplot() is the workhorse, similar to plot() in base rggplot() is core function, can do things qplot() cannotEx. 1 - Hello world
library(ggplot2)
## Loading required package: methods
data(mpg)
str(mpg)
## 'data.frame': 234 obs. of 11 variables:
## $ manufacturer: Factor w/ 15 levels "audi","chevrolet",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ model : Factor w/ 38 levels "4runner 4wd",..: 2 2 2 2 2 2 2 3 3 3 ...
## $ displ : num 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : Factor w/ 10 levels "auto(av)","auto(l3)",..: 4 9 10 1 4 9 1 9 4 10 ...
## $ drv : Factor w/ 3 levels "4","f","r": 2 2 2 2 2 2 2 1 1 1 ...
## $ cty : int 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : Factor w/ 5 levels "c","d","e","p",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ class : Factor w/ 7 levels "2seater","compact",..: 2 2 2 2 2 2 2 2 2 2 ...
# 'hello world' of ggplot
qplot(displ, hwy, data = mpg)
qplot(displ, hwy, data = mpg, color = drv)
qplot(displ, hwy, data = mpg, geom = c("point", "smooth"))
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
qplot(displ, hwy, data = mpg, color = drv) + geom_smooth(method = "lm")
Ex. 2 - Histograms
# make histogram by only specifying one variable (hwy); fill bars by drive type
qplot(hwy, data = mpg, fill = drv)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Ex. 3 - Facets
factor ~ factor. indicates that no variable is specified and usually equates to 1 row/columnqplot(displ, hwy, data = mpg, facets = .~drv)
qplot(hwy, data = mpg, facets = drv ~., binwidth = 2)
colorRamp()colorRampPalette()Ex. Return RGB
pal <- colorRamp(c("red", "blue"))
pal(0)
## [,1] [,2] [,3]
## [1,] 255 0 0
pal(1)
## [,1] [,2] [,3]
## [1,] 0 0 255
pal(0.5)
## [,1] [,2] [,3]
## [1,] 127.5 0 127.5
pal(seq(0, 1, len = 10))
## [,1] [,2] [,3]
## [1,] 255.00000 0 0.00000
## [2,] 226.66667 0 28.33333
## [3,] 198.33333 0 56.66667
## [4,] 170.00000 0 85.00000
## [5,] 141.66667 0 113.33333
## [6,] 113.33333 0 141.66667
## [7,] 85.00000 0 170.00000
## [8,] 56.66667 0 198.33333
## [9,] 28.33333 0 226.66667
## [10,] 0.00000 0 255.00000
Ex. Return hexadecimals (like HTML)
pal <- colorRampPalette(c("red", "yellow"))
pal(2)
## [1] "#FF0000" "#FFFF00"
pal(10)
## [1] "#FF0000" "#FF1C00" "#FF3800" "#FF5500" "#FF7100" "#FF8D00" "#FFAA00"
## [8] "#FFC600" "#FFE200" "#FFFF00"
Ex. Creating color combinations
library(RColorBrewer)
## Warning: package 'RColorBrewer' was built under R version 3.1.2
cols <- brewer.pal(3, "BuGn")
pal <- colorRampPalette(cols)
image(volcano, col = pal(20))
Ex. Smooth scatter function
x <- rnorm(10000)
y <- rnorm(10000)
smoothScatter(x, y)
Ex.
set.seed(1234)
par(mar = c(0, 0, 0, 0))
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(1:12))
Ex. Create a distance matrix
dataFrame <- data.frame(x = x, y = y)
# defaults to euclidean distance
dist(dataFrame)
## 1 2 3 4 5 6
## 2 0.34120511
## 3 0.57493739 0.24102750
## 4 0.26381786 0.52578819 0.71861759
## 5 1.69424700 1.35818182 1.11952883 1.80666768
## 6 1.65812902 1.31960442 1.08338841 1.78081321 0.08150268
## 7 1.49823399 1.16620981 0.92568723 1.60131659 0.21110433 0.21666557
## 8 1.99149025 1.69093111 1.45648906 2.02849490 0.61704200 0.69791931
## 9 2.13629539 1.83167669 1.67835968 2.35675598 1.18349654 1.11500116
## 10 2.06419586 1.76999236 1.63109790 2.29239480 1.23847877 1.16550201
## 11 2.14702468 1.85183204 1.71074417 2.37461984 1.28153948 1.21077373
## 12 2.05664233 1.74662555 1.58658782 2.27232243 1.07700974 1.00777231
## 7 8 9 10 11
## 2
## 3
## 4
## 5
## 6
## 7
## 8 0.65062566
## 9 1.28582631 1.76460709
## 10 1.32063059 1.83517785 0.14090406
## 11 1.37369662 1.86999431 0.11624471 0.08317570
## 12 1.17740375 1.66223814 0.10848966 0.19128645 0.20802789
Ex. Dendrogram
distxy <- dist(dataFrame)
hClustering <- hclust(distxy)
plot(hClustering)
Ex. K-means clustering
set.seed(1234)
par(mar = c(0, 0, 0, 0))
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(1:12))
dataFrame <- data.frame(x, y)
kmeansObj <- kmeans(dataFrame, centers = 3)
# plot kmeans()
par(mar = rep(0.2, 4))
plot(x, y, col = kmeansObj$cluster, pch = 19, cex = 2)
points(kmeansObj$centers, col = 1:3, pch = 3, cex = 3, lwd = 3)
Ex. with heatmap
# k-means shown as a heatmap: raw row order vs rows ordered by cluster
set.seed(1234)
dataMatrix <- as.matrix(dataFrame)[sample(1:12), ]
kmeansObj2 <- kmeans(dataMatrix, centers = 3)
par(mfrow = c(1, 2), mar = c(2, 4, 0.1, 0.1))
image(t(dataMatrix)[, nrow(dataMatrix):1], yaxt = "n")
# fixed: order by the clustering just computed (kmeansObj2, not the stale kmeansObj)
image(t(dataMatrix)[, order(kmeansObj2$cluster)], yaxt = "n")
Ex. SVD
set.seed(12345)
par(mar = rep(0.2, 4))
dataMatrix <- matrix(rnorm(400), nrow = 40)
image(1:10, 1:40, t(dataMatrix)[, nrow(dataMatrix):1])
par(mar = rep(0.2, 4))
heatmap(dataMatrix)
# add a pattern
set.seed(678910)
for (i in 1:40) {
# flip a coin
coinflip <- rbinom(1, size = 1, prob = 0.5)
# if coin is heads add a common pattern to that row
if (coinflip) {
dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 3), each = 5)
}
}
par(mar = rep(0.2, 4))
image(1:10, 1:40, t(dataMatrix)[, nrow(dataMatrix):1])
heatmap(dataMatrix)
hh <- hclust(dist(dataMatrix))
dataMatrixOrdered <- dataMatrix[hh$order, ]
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(rowMeans(dataMatrixOrdered), 40:1, xlab = "Row means", ylab = "Row", pch = 19)
plot(colMeans(dataMatrixOrdered), xlab = "Column", ylab = "Column mean", pch = 19)
svd1 <- svd(scale(dataMatrixOrdered))
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(svd1$u[, 1], 40:1, xlab = "Row", ylab = "First left singular vector", pch = 19)
plot(svd1$v[, 1], xlab = "Column", ylab = "First right singular vector", pch = 19)
Ex. Components of the SVD - variance explained
par(mfrow = c(1, 2))
plot(svd1$d, xlab = "Column", ylab = "Singular value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Prop. of variance explained", pch = 19)
Ex. Relationship to principal components
svd1 <- svd(scale(dataMatrixOrdered))
pca1 <- prcomp(dataMatrixOrdered, scale = TRUE)
plot(pca1$rotation[, 1], svd1$v[, 1], pch = 19, xlab = "Principal Component 1", ylab = "Right singular vector 1")
abline(c(0, 1))
Ex. More variance explained
constantMatrix <- dataMatrixOrdered*0
for(i in 1:dim(dataMatrixOrdered)[1]){constantMatrix[i, ] <- rep(c(0, 1), each = 5)}
svd1 <- svd(constantMatrix)
par(mfrow = c(1, 3))
image(t(constantMatrix)[, nrow(constantMatrix):1])
plot(svd1$d, xlab = "Column", ylab = "Singular value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Prop. of variance explained", pch = 19)
Ex. Add a second pattern
set.seed(678910)
for (i in 1:40) {
# flip a coin
coinflip1 <- rbinom(1, size = 1, prob = 0.5)
coinflip2 <- rbinom(1, size = 1, prob = 0.5)
# if coin is heads add a common pattern to that row
if (coinflip1) {
dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 5), each = 5)
}
if (coinflip2) {
dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 5), 5)
}
}
hh <- hclust(dist(dataMatrix))
dataMatrixOrdered <- dataMatrix[hh$order, ]
Ex. True patterns
svd2 <- svd(scale(dataMatrixOrdered))
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(rep(c(0, 1), each = 5), pch = 19, xlab = "Column", ylab = "Pattern 1")
plot(rep(c(0, 1), 5), pch = 19, xlab = "Column", ylab = "Pattern 2")
svd2 <- svd(scale(dataMatrixOrdered))
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(svd2$v[, 1], pch = 19, xlab = "Column", ylab = "First right singular vector")
plot(svd2$v[, 2], pch = 19, xlab = "Column", ylab = "second right singular vector")
svd1 <- svd(scale(dataMatrixOrdered))
par(mfrow = c(1, 2))
plot(svd1$d, xlab = "Column", ylab = "Singular value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Prop. of variance explained", pch = 19)
dataMatrix2 <- dataMatrixOrdered
## Randomly insert some missing data
dataMatrix2[sample(1:100, size = 40, replace = FALSE)] <- NA
library(impute) # not installed
dataMatrix2 <- impute.knn(dataMatrix2)$data
svd1 <- svd(scale(dataMatrixOrdered)); svd2 <- svd(scale(dataMatrix2))
par(mfrow = c(1, 2)); plot(svd1$v[, 1], pch = 19); plot(svd2$v[, 1], pch = 19)
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
data(sleep); head(sleep)
## extra group ID
## 1 0.7 1 1
## 2 -1.6 1 2
## 3 -0.2 1 3
## 4 -1.2 1 4
## 5 -0.1 1 5
## 6 3.4 1 6
g1 <- sleep$extra[1:10]; g2 <- sleep$extra[11:20]
difference <- g2 - g1
mn <- mean(difference); s <- sd(difference); n <- 10
# confidence interval
mn + c(-1, 1) * qt(.975, n-1) * s / sqrt(n)
## [1] 0.7001142 2.4598858
t.test(difference)
##
## One Sample t-test
##
## data: difference
## t = 4.0621, df = 9, p-value = 0.002833
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.7001142 2.4598858
## sample estimates:
## mean of x
## 1.58
t.test(g2, g1, paired = TRUE)
##
## Paired t-test
##
## data: g2 and g1
## t = 4.0621, df = 9, p-value = 0.002833
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.7001142 2.4598858
## sample estimates:
## mean of the differences
## 1.58
t.test(extra ~ I(relevel(group, 2)), paired = TRUE, data = sleep)
##
## Paired t-test
##
## data: extra by I(relevel(group, 2))
## t = 4.0621, df = 9, p-value = 0.002833
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.7001142 2.4598858
## sample estimates:
## mean of the differences
## 1.58
# unpaired t interval, assuming equal variance
# x_oc = 132.86 sd = 15.34, n = 8
# x_c = 127.44 sd = 18.23, n = 21
# pooled standard deviation
# fixed: the second variance term was missing its square (18.23 -> 18.23^2)
sp <- sqrt((7 * 15.34^2 + 20 * 18.23^2) / (8 + 21 - 2))
# interval: difference of means +/- t * sp * sqrt(1/n1 + 1/n2)
132.86 - 127.44 + c(-1, 1) * qt(.975, 27) * sp * (1 / 8 + 1 / 21)^.5
## [1] -9.52 20.36
# paired vs. unpaired intervals (fixed: two statements were fused onto one line)
t.test(y ~ x, paired = TRUE, var.equal = TRUE, data = df)$conf
t.test(y ~ x, paired = FALSE)$conf
t.test() function in conjuntion with $conf.intlibrary(datasets); data(mtcars)
# 95% t interval for mean mpg in the mtcars data, rounded to whole numbers.
round(t.test(mtcars$mpg)$conf.int)
## [1] 18 22
## attr(,"conf.level")
## [1] 0.95
# Half-width of a 95% t interval: t quantile (df = 8) times SE = 30 / sqrt(9).
round(qt(.975, df = 8) * 30 / 3, 2)
## [1] 23.06
# Compare mpg between 4- and 6-cylinder cars.
m4 <- mtcars$mpg[mtcars$cyl == 4]
m6 <- mtcars$mpg[mtcars$cyl == 6]
# this does the difference in the direction 4-cylinder minus 6-cylinder
as.vector(t.test(m4, m6, var.equal = TRUE)$conf.int)
## [1] 3.154286 10.687272
# Pooled variance from summary statistics of two groups of 9.
n1 <- n2 <- 9
x1 <- -3 ## treated group mean
x2 <- 1 ## placebo group mean
s1 <- 1.5 ## treated group sd
s2 <- 1.8 ## placebo group sd
spsq <- ( (n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)
# Equivalent calculations of power using n, delta, and SD: power depends on
# delta and sd only through the effect size delta / sd, so all three give
# the same answer.
power.t.test(n = 16, delta = 2/4, type = 'one.sample', alt = 'one.sided')$power
## [1] 0.6040329
power.t.test(n = 16, delta = 2, sd = 4, type = 'one.sample', alt = 'one.sided')$power
## [1] 0.6040329
power.t.test(n = 16, delta = 100, sd = 200, type = "one.sample", alt = 'one.sided')$power
## [1] 0.6040329
# Equivalent calculations of sample size using power, delta and SD — same
# invariance to delta / sd.
power.t.test(power = 0.8, delta = 2/4, sd = 1, type = 'one.sample', alt = 'one.sided')$n
## [1] 26.13751
power.t.test(power = 0.8, delta = 2, sd = 4, type = 'one.sample', alt = 'one.sided')$n
## [1] 26.13751
power.t.test(power = 0.8, delta = 100, sd = 200, type = 'one.sample', alt = 'one.sided')$n
## [1] 26.13751
library(UsingR); data(father.son)
## Loading required package: MASS
## Loading required package: HistData
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.1.2
## Loading required package: grid
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
##
## Loading required package: quantreg
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
##
## The following object is masked from 'package:base':
##
## backsolve
##
##
## Attaching package: 'quantreg'
##
## The following object is masked from 'package:Hmisc':
##
## latex
##
## The following object is masked from 'package:survival':
##
## untangle.specials
##
##
## Attaching package: 'UsingR'
##
## The following object is masked from 'package:survival':
##
## cancer
##
## The following object is masked from 'package:ggplot2':
##
## movies
# Bootstrap the median of sons' heights: draw B resamples with replacement
# and examine the spread of the resampled medians.
x <- father.son$sheight
n <- length(x)
B <- 10000
# Each of the B rows is one bootstrap resample of size n.
resamples <- matrix(sample(x, n * B, replace = TRUE), B, n)
resampledMedians <- apply(resamples, 1, median)
# Bootstrap estimate of the standard error of the median.
sd(resampledMedians)
## [1] 0.08370475
# Percentile bootstrap 95% confidence interval.
quantile(resampledMedians, c(0.025, 0.975))
## 2.5% 97.5%
## 68.43733 68.81461
g <- ggplot(data.frame(resampledMedians = resampledMedians), aes(x = resampledMedians))
g + geom_histogram(color = 'black', fill = 'lightblue', binwidth = 0.05)
# Permutation test: do sprays B and C differ in insect counts? Shuffle the
# group labels many times and compare against the observed statistic.
subdata <- InsectSprays[InsectSprays$spray %in% c("B", "C"), ]
y <- subdata$count
group <- as.character(subdata$spray)
# Test statistic: difference of group means, B minus C.
testStat <- function(counts, labels) {
  mean(counts[labels == "B"]) - mean(counts[labels == "C"])
}
observedStat <- testStat(y, group)
# Null distribution: the statistic under 10,000 random label shuffles.
permutations <- vapply(seq_len(10000), function(i) testStat(y, sample(group)), numeric(1))
observedStat
## [1] 13.25
# Permutation p-value: fraction of shuffles exceeding the observed value.
mean(permutations > observedStat)
## [1] 0
# Galton parent/child heights: compare the two marginal distributions.
library(UsingR); data(galton)
par(mfrow = c(1, 2))
hist(galton$child, col = "blue", breaks = 100)
hist(galton$parent, col = "blue", breaks = 100)
# Interactively slide a candidate center mu and watch the mean squared
# error: the MSE is minimized when mu equals the sample mean.
library(manipulate)
myHist <- function(mu){
hist(galton$child,col="blue",breaks=100)
# Vertical red line at the candidate center.
lines(c(mu, mu), c(0, 150),col="red",lwd=5)
# Mean squared error of the data around mu.
mse <- mean((galton$child - mu)^2)
text(63, 150, paste("mu = ", mu))
text(63, 140, paste("MSE = ", round(mse, 2)))
}
manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))
plot(galton$parent,galton$child,col="blue")
# Overplotting hides the many identical (parent, child) pairs; tabulate the
# pairs and size each point by its frequency instead.
freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child", "parent", "freq")
plot(as.numeric(as.vector(freqData$parent)),
as.numeric(as.vector(freqData$child)),
pch = 21, col = "black", bg = "lightblue",
cex = .15 * freqData$freq,
xlab = "parent", ylab = "child")
# Interactive exploration of the through-the-origin least squares slope on
# mean-centered Galton data: slide beta and watch the MSE.
myPlot <- function(beta){
  # Mean-centered child (y) and parent (x) heights.
  y <- galton$child - mean(galton$child)
  x <- galton$parent - mean(galton$parent)
  freqData <- as.data.frame(table(x, y))
  # table(x, y) puts the parent deviations (x) first, then the child
  # deviations (y). The original notes named the columns
  # c("child", "parent", "freq"), which silently swapped the plot axes
  # relative to their labels; name them in the actual column order.
  names(freqData) <- c("parent", "child", "freq")
  plot(
    as.numeric(as.vector(freqData$parent)),
    as.numeric(as.vector(freqData$child)),
    pch = 21, col = "black", bg = "lightblue",
    cex = .15 * freqData$freq,
    xlab = "parent",
    ylab = "child"
  )
  # Candidate regression line through the origin with slope beta.
  abline(0, beta, lwd = 3)
  points(0, 0, cex = 2, pch = 19)
  # MSE of the through-the-origin fit; minimized at the least squares slope.
  mse <- mean( (y - beta * x)^2 )
  title(paste("beta = ", beta, "mse = ", round(mse, 3)))
}
manipulate(myPlot(beta), beta = slider(0.6, 1.2, step = 0.02))
Empirical mean (central tendency)
\[
\bar X = \frac{1}{n}\sum_{i=1}^n X_i.
\]
Subtracting the mean from each data point is called centering. The mean is the least squares solution for minimizing
\[
\sum_{i=1}^n (X_i - \mu)^2
\]
Dividing the (centered) data by the standard deviation is called scaling the data.
Empirical covariance:
# Least squares slope and intercept computed from the correlation and the
# two standard deviations, checked against lm().
y <- galton$child
x <- galton$parent
beta1 <- cor(y, x) * sd(y) / sd(x)
beta0 <- mean(y) - beta1 * mean(x)
rbind(c(beta0, beta1), coef(lm(y ~ x)))
## (Intercept) x
## [1,] 23.94153 0.6462906
## [2,] 23.94153 0.6462906
# With centered data the line passes through the origin and the slope is
# sum(yc * xc) / sum(xc^2) — same slope as before.
yc <- y - mean(y)
xc <- x - mean(x)
beta1 <- sum(yc * xc) / sum(xc^2)
c(beta1, coef(lm(y ~ x))[2])
## x
## 0.6462906 0.6462906
library(UsingR)
data(father.son)
# Normalize both heights to mean 0, sd 1, so that regression slopes equal
# the correlation.
y <- (father.son$sheight - mean(father.son$sheight)) / sd(father.son$sheight)
x <- (father.son$fheight - mean(father.son$fheight)) / sd(father.son$fheight)
rho <- cor(x, y)
# Scatterplot of normalized father vs. son heights on a fixed square window.
myPlot <- function(x, y) {
plot(x, y,
xlab = "Father's height, normalized",
ylab = "Son's height, normalized",
xlim = c(-3, 3), ylim = c(-3, 3),
bg = "lightblue", col = "black", cex = 1.1, pch = 21,
frame = FALSE)
}
myPlot(x, y)
abline(0, 1) # if there were perfect correlation
abline(0, rho, lwd = 2) # father predicts son
abline(0, 1 / rho, lwd = 2) # son predicts father, son on vertical axis
abline(h = 0); abline(v = 0) # reference lines for no relationship
library(UsingR); data (diamond)
# Diamond price (Singapore dollars) vs. mass (carats) with the least
# squares line overlaid.
with(diamond, plot(carat, price,
frame = FALSE,
xlab = "Mass (carats)",
ylab = "Price (SIN $)",
bg = "lightblue",
col = "black", cex = 1.1, pch = 21))
abline(lm(price ~ carat, data = diamond), lwd = 2)
Check parameters
fit <- lm(price ~ carat, data = diamond)
coef(fit)
## (Intercept) carat
## -259.6259 3721.0249
Make the intercept more interpretable by subtracting the mean of the predictor from the predictor (centering).
# Centering leaves the slope unchanged; the intercept becomes the expected
# price of an average-mass diamond.
fit2 <- lm(price ~ I(carat - mean(carat)), data = diamond)
coef(fit2)
## (Intercept) I(carat - mean(carat))
## 500.0833 3721.0249
Changing scale to make the slope more interpretable: change 1 carat to 1/10th of a carat by dividing the coefficient by 10 (or multiplying the predictor by 10 in the model formula)
# Rescale the predictor to tenths of a carat; the slope is divided by 10
# while the intercept is unchanged.
fit3 <- lm(price ~ I(carat * 10), data = diamond)
coef(fit3)
## (Intercept) I(carat * 10)
## -259.6259 372.1025
Making predictions of the price of diamonds (select a value for carats, multiply it by the slope and add the intercept)
# Predict by hand: intercept + slope * new carat values.
newx <- c(0.16, 0.27, 0.34)
coef(fit)[1] + coef(fit)[2] * newx
## [1] 335.7381 745.0508 1005.5225
This can also be accomplished automatically with the predict() function (note: the new data you want to predict must be in a data frame and the variable must have the same name as the predictor in the model)
predict(fit, newdata = data.frame(carat = newx))
## 1 2 3
## 335.7381 745.0508 1005.5225
# Residuals: e = y - yhat. resid() agrees with the hand computation up to
# floating point error.
y <- diamond$price; x <- diamond$carat; n <- length(y)
fit <- lm(y ~ x)
e <- resid(fit)
yhat <- predict(fit)
max(abs(e - (y - yhat)))
## [1] 9.485746e-13
max(abs(e - (y - coef(fit)[1] - coef(fit)[2] * diamond$carat)))
## [1] 9.485746e-13
# Residual plot: look for structure around the zero reference line.
plot(x, resid(fit), frame = FALSE)
abline(h = 0)
# Simulated nonlinearity: the residual plot reveals the sin(x) pattern that
# the raw scatterplot with its fitted line tends to hide.
x <- runif(100, -3, 3); y <- x + sin(x) + rnorm(100, sd = .2);
par(mfrow = c(1, 2))
plot(x, y); abline(lm(y ~ x))
plot(x, resid(lm(y ~ x))); abline(h = 0)
# Simulated heteroskedasticity: the error sd grows with x, visible in the
# fanning of the residual plot.
x <- runif(100, 0, 6); y <- x + rnorm(100, mean = 0, sd = 0.001 * x);
par(mfrow = c(1, 2))
plot(x, y); abline(lm(y ~ x))
plot(x, resid(lm(y ~ x))); abline(h = 0)
The residual standard deviation can be extracted with the summary() function:
summary(fit)$sigma
## [1] 31.84052
# Build the standard regression coefficient table (estimates, standard
# errors, t statistics, p-values) by hand, then check against summary(fit).
y <- diamond$price; x <- diamond$carat; n <- length(y)
beta1 <- cor(y, x) * sd(y) / sd(x)
beta0 <- mean(y) - beta1 * mean(x)
e <- y - beta0 - beta1 * x
# Residual standard deviation (n - 2 degrees of freedom).
sigma <- sqrt(sum(e^2) / (n-2))
ssx <- sum((x - mean(x))^2)
# Standard errors of intercept and slope.
seBeta0 <- (1/n + mean(x)^2 / ssx)^.5 * sigma
seBeta1 <- sigma / sqrt(ssx)
tBeta0 <- beta0/seBeta0; tBeta1 <- beta1 / seBeta1
# Two-sided p-values from the t distribution with n - 2 df.
pBeta0 <- 2 * pt(abs(tBeta0), df = n - 2, lower.tail = FALSE)
pBeta1 <- 2 * pt(abs(tBeta1), df = n - 2, lower.tail = FALSE)
coefTable <- rbind(c(beta0, seBeta0, tBeta0, pBeta0), c(beta1, seBeta1, tBeta1, pBeta1))
colnames(coefTable) <- c("Estimate", "Std. error", "t value", "p(>|t|)")
rownames(coefTable) <- c("(Intercept)", "x")
print(coefTable)
## Estimate Std. error t value p(>|t|)
## (Intercept) -259.6259 17.31886 -14.99094 2.523271e-19
## x 3721.0249 81.78588 45.49715 6.751260e-40
summary(fit)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -259.6259 17.31886 -14.99094 2.523271e-19
## x 3721.0249 81.78588 45.49715 6.751260e-40
# 95% confidence intervals for the intercept and the slope.
sumCoef <- summary(fit)$coefficients
sumCoef[1, 1] + c(-1, 1) * qt(.975, df = fit$df) * sumCoef[1, 2]
## [1] -294.4870 -224.7649
sumCoef[2, 1] + c(-1, 1) * qt(.975, df = fit$df) * sumCoef[2, 2]
## [1] 3556.398 3885.651
# Plot the data and fitted line with interval bands computed by hand:
# se1 = SE of the fitted mean (confidence band), se2 = SE of a new
# observation (prediction band — wider by the extra 1 under the root).
plot(x, y, frame = FALSE, xlab = "", ylab = "",
pch = 21, col = "black", bg = "lightblue",
cex = 2)
abline(fit, lwd = 2)
xVals <- seq(min(x), max(x), by = 0.01)
yVals <- beta0 + beta1 * xVals
se1 <- sigma * sqrt(1 / n + (xVals - mean(x))^2/ssx)
se2 <- sigma * sqrt(1 + 1 / n + (xVals - mean(x))^2/ssx)
lines(xVals, yVals + 2 * se1)
lines(xVals, yVals - 2 * se1)
lines(xVals, yVals + 2 * se2)
lines(xVals, yVals - 2 * se2)
# Same bands produced automatically by predict().
newdata <- data.frame(x = xVals)
p1 <- predict(fit, newdata, interval = ("confidence"))
p2 <- predict(fit, newdata, interval = ("prediction"))
plot(x, y, frame = FALSE, pch = 21, col = "black", bg = "lightblue", cex = 2)
abline(fit, lwd = 2)
# Columns 2 and 3 of the predict() output are the lower and upper bounds.
lines(xVals, p1[, 2]); lines(xVals, p1[, 3])
lines(xVals, p2[, 2]); lines(xVals, p2[, 3])
set.seed(1)
n <- 100; x <- runif(n)
# Binary treatment indicator: first half 0 (control), second half 1 (treated).
z <- rep(c(0, 1), c(n/2, n/2))
# Simulation parameters: intercept, slope, treatment effect, error sd.
beta0 <- 0; beta1 <- 2; tau <- 1; sigma <- 0.2
# Model: y_i = beta0 + beta1 * x_i + tau * z_i + e_i,  e_i ~ N(0, sigma^2)
y <- beta0 + x * beta1 + z * tau + rnorm(n, sd = sigma)
# Fit with an interaction so each treatment group gets its own line.
mod <- lm(y ~ x * z)
plot(x, y, frame = F)
# Control-group line: base intercept and slope.
abline(c(mod$coeff[1], mod$coeff[2]), col = 'black')
# Treated-group line: add the treatment shifts in intercept and slope.
abline(c(mod$coeff[1] + mod$coeff[3],
mod$coeff[2] + mod$coeff[4]), col = 'black')
# points(x, y, pch = 21, col = ((z == "0")*1+1), bg = 'red')
points(x, y, pch = 21, col = ((z == "1")*1+1), bg = 'blue')
# Reference lines at the overall means of y and x.
abline(a = mean(y), b = 0)
abline(v = mean(x))
rstandard: standardized residuals (residual divided by its standard deviation)
rstudent: standardized residuals with the ith data point deleted, so they follow a t distribution
hatvalues: measures of leverage — round(hatvalues(model)[1:10], 3)
dffits: change in the predicted response when the ith point is deleted in fitting the model
dfbetas: change in individual coefficients when the ith point is deleted in fitting the model — round(dfbetas(model)[1:10], 3)
cooks.distance: overall change in the coefficients when the ith point is deleted
resid: returns the ordinary residuals
resid(fit) / (1 - hatvalues(fit)): PRESS residuals, the leave-one-out cross-validation residuals — the difference between the response and the predicted response at data point i when point i was not included in the model fit
require(stats); data(swiss)
# Variance inflation: watch the (unscaled) variance of the Agriculture
# coefficient grow as correlated regressors are added to the model.
fit1 <- lm(Fertility ~ Agriculture, data = swiss)
a <- summary(fit1)$cov.unscaled[2, 2]
fit2 <- update(fit1, Fertility ~ Agriculture + Examination)
fit3 <- update(fit1, Fertility ~ Agriculture + Examination + Education)
# Ratio of the coefficient variances relative to the single-regressor model.
c(summary(fit2)$cov.unscaled[2, 2],
summary(fit3)$cov.unscaled[2, 2]) / a
## [1] 1.891576 2.089159
# vif(fit1)
# sqrt(vif(fit1))
# Nested-model comparison: does each added regressor significantly reduce
# the residual sum of squares?
anova(fit1, fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Fertility ~ Agriculture
## Model 2: Fertility ~ Agriculture + Examination
## Model 3: Fertility ~ Agriculture + Examination + Education
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 45 6283.1
## 2 44 4072.7 1 2210.38 29.880 2.172e-06 ***
## 3 43 3180.9 1 891.81 12.056 0.001189 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(caret); library(kernlab); data(spam)
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:survival':
##
## cluster
# 75/25 train/test split, stratified on the outcome (spam/nonspam).
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
dim(training)
## [1] 3451 58
# Fit a logistic regression (GLM) spam classifier on the training set.
set.seed(32343)
modelFit <- train(type ~ ., data = training, method = "glm")
modelFit
## Generalized Linear Model
##
## 3451 samples
## 57 predictor
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
##
## Resampling results
##
## Accuracy Kappa Accuracy SD Kappa SD
## 0.9173753 0.8268281 0.01276529 0.02453941
##
##
modelFit$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) make address
## -1.681e+00 -5.893e-01 -1.349e-01
## all num3d our
## 4.432e-02 3.478e+00 4.483e-01
## over remove internet
## 1.187e+00 2.056e+00 5.285e-01
## order mail receive
## 7.244e-01 8.545e-02 -1.999e-02
## will people report
## -1.067e-01 1.173e-01 5.179e-02
## addresses free business
## 7.939e-01 1.102e+00 9.371e-01
## email you credit
## 1.846e-01 1.112e-01 8.711e-01
## your font num000
## 2.595e-01 1.058e-01 2.597e+00
## money hp hpl
## 3.641e-01 -2.005e+00 -9.171e-01
## george num650 lab
## -1.272e+01 3.517e-01 -2.116e+00
## labs telnet num857
## -1.082e+00 -1.238e-01 1.419e+00
## data num415 num85
## -5.260e-01 1.513e+00 -1.682e+00
## technology num1999 parts
## 1.005e+00 1.898e-01 -6.596e-01
## pm direct cs
## -8.101e-01 -2.300e-01 -5.800e+02
## meeting original project
## -3.177e+00 -2.466e+00 -1.639e+00
## re edu table
## -7.808e-01 -1.829e+00 -1.740e+00
## conference charSemicolon charRoundbracket
## -3.539e+00 -1.178e+00 -1.099e-01
## charSquarebracket charExclamation charDollar
## -5.028e-01 2.283e-01 5.702e+00
## charHash capitalAve capitalLong
## 3.711e+00 8.134e-03 1.029e-02
## capitalTotal
## 9.172e-04
##
## Degrees of Freedom: 3450 Total (i.e. Null); 3393 Residual
## Null Deviance: 4628
## Residual Deviance: 1321 AIC: 1437
# Predict spam/nonspam on the held-out test set.
predictions <- predict(modelFit, newdata = testing)
predictions
## [1] spam spam spam spam spam spam spam spam
## [9] spam spam spam spam spam nonspam spam nonspam
## [17] spam spam spam spam spam nonspam spam spam
## [25] nonspam spam spam spam spam spam spam spam
## [33] spam spam spam spam spam nonspam spam spam
## [41] spam spam spam nonspam nonspam spam spam spam
## [49] spam spam spam nonspam spam nonspam spam nonspam
## [57] spam spam spam nonspam spam spam spam spam
## [65] spam spam spam spam spam spam spam spam
## [73] spam nonspam nonspam spam spam nonspam spam nonspam
## [81] spam spam nonspam spam nonspam spam spam spam
## [89] spam spam spam spam spam spam spam spam
## [97] spam spam spam spam nonspam spam spam spam
## [105] spam spam spam spam spam spam spam spam
## [113] nonspam spam spam spam spam nonspam spam spam
## [121] spam spam spam spam spam nonspam spam spam
## [129] spam spam spam spam spam spam spam nonspam
## [137] spam spam spam spam spam spam spam spam
## [145] spam spam spam spam spam spam spam spam
## [153] spam nonspam spam spam spam spam spam nonspam
## [161] spam spam spam spam spam spam spam spam
## [169] spam spam spam spam spam spam spam nonspam
## [177] spam spam spam spam spam spam spam spam
## [185] nonspam spam spam spam spam spam spam nonspam
## [193] spam spam spam spam spam spam spam spam
## [201] spam spam spam spam nonspam spam spam spam
## [209] spam nonspam spam spam spam spam spam nonspam
## [217] spam spam nonspam spam spam spam spam spam
## [225] spam spam spam spam spam spam nonspam nonspam
## [233] nonspam spam spam spam spam spam spam spam
## [241] spam spam spam spam nonspam spam spam spam
## [249] spam spam spam spam spam nonspam spam spam
## [257] spam spam spam spam spam spam spam spam
## [265] spam spam spam spam spam spam spam spam
## [273] spam spam nonspam spam spam spam spam spam
## [281] spam spam spam spam spam spam spam spam
## [289] spam spam spam spam spam spam nonspam spam
## [297] spam spam spam spam spam spam spam spam
## [305] spam spam nonspam spam spam spam nonspam spam
## [313] spam spam spam spam spam spam nonspam spam
## [321] spam spam spam spam spam spam spam spam
## [329] spam spam spam spam spam spam spam spam
## [337] spam spam spam spam spam spam nonspam spam
## [345] spam spam spam spam spam nonspam spam nonspam
## [353] spam spam spam spam spam spam spam nonspam
## [361] nonspam spam spam spam spam spam spam spam
## [369] spam nonspam spam spam spam spam spam spam
## [377] nonspam spam spam spam nonspam spam spam spam
## [385] nonspam spam spam spam nonspam nonspam nonspam spam
## [393] spam spam spam nonspam spam spam spam spam
## [401] spam nonspam spam spam spam spam nonspam spam
## [409] spam nonspam nonspam spam spam spam spam nonspam
## [417] spam spam spam spam spam nonspam nonspam nonspam
## [425] spam spam spam nonspam spam spam spam spam
## [433] nonspam spam spam spam spam nonspam spam spam
## [441] spam spam spam spam spam spam spam spam
## [449] spam spam spam spam spam nonspam nonspam nonspam
## [457] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [465] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [473] nonspam nonspam nonspam nonspam nonspam nonspam spam nonspam
## [481] spam nonspam nonspam nonspam nonspam nonspam spam nonspam
## [489] nonspam nonspam nonspam nonspam nonspam spam nonspam nonspam
## [497] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [505] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [513] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [521] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [529] nonspam nonspam nonspam nonspam nonspam spam nonspam nonspam
## [537] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [545] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [553] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [561] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [569] nonspam spam spam nonspam nonspam nonspam spam nonspam
## [577] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [585] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [593] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [601] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [609] nonspam nonspam nonspam nonspam spam nonspam nonspam nonspam
## [617] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [625] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [633] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [641] nonspam nonspam nonspam nonspam nonspam spam nonspam nonspam
## [649] nonspam nonspam nonspam nonspam spam nonspam nonspam nonspam
## [657] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [665] nonspam nonspam nonspam nonspam nonspam nonspam spam nonspam
## [673] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [681] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [689] spam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [697] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [705] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [713] nonspam nonspam nonspam spam nonspam nonspam nonspam nonspam
## [721] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [729] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [737] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [745] nonspam nonspam nonspam nonspam nonspam nonspam nonspam spam
## [753] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [761] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [769] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [777] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [785] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [793] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [801] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [809] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [817] nonspam nonspam nonspam nonspam nonspam spam nonspam nonspam
## [825] nonspam nonspam spam nonspam spam nonspam nonspam nonspam
## [833] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [841] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [849] nonspam nonspam nonspam nonspam nonspam nonspam nonspam spam
## [857] spam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [865] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [873] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [881] nonspam nonspam nonspam nonspam spam nonspam nonspam nonspam
## [889] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [897] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [905] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [913] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [921] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [929] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [937] nonspam nonspam spam nonspam nonspam nonspam nonspam nonspam
## [945] nonspam nonspam nonspam nonspam nonspam spam nonspam nonspam
## [953] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [961] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [969] nonspam nonspam nonspam nonspam nonspam nonspam nonspam spam
## [977] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [985] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [993] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1001] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1009] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1017] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1025] nonspam nonspam nonspam spam nonspam nonspam nonspam nonspam
## [1033] nonspam nonspam nonspam nonspam nonspam nonspam nonspam spam
## [1041] nonspam nonspam nonspam nonspam nonspam nonspam nonspam spam
## [1049] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1057] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1065] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1073] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1081] nonspam nonspam spam nonspam nonspam nonspam nonspam spam
## [1089] nonspam nonspam nonspam nonspam nonspam nonspam spam nonspam
## [1097] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1105] nonspam nonspam nonspam nonspam nonspam spam nonspam spam
## [1113] spam spam nonspam nonspam nonspam nonspam nonspam nonspam
## [1121] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1129] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1137] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1145] nonspam nonspam nonspam nonspam nonspam nonspam
## Levels: nonspam spam
# Accuracy, sensitivity, specificity, etc. of the test-set predictions.
confusionMatrix(predictions, testing$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 663 65
## spam 34 388
##
## Accuracy : 0.9139
## 95% CI : (0.8962, 0.9295)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8175
## Mcnemar's Test P-Value : 0.002569
##
## Sensitivity : 0.9512
## Specificity : 0.8565
## Pos Pred Value : 0.9107
## Neg Pred Value : 0.9194
## Prevalence : 0.6061
## Detection Rate : 0.5765
## Detection Prevalence : 0.6330
## Balanced Accuracy : 0.9039
##
## 'Positive' Class : nonspam
##
library(caret); library(kernlab); data(spam)
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
dim(training)
# 10-fold cross-validation folds; returnTrain = TRUE returns the TRAINING
# indices for each fold (~9/10 of the data each).
set.seed(32323)
folds <- createFolds(y = spam$type, k = 10, list = TRUE, returnTrain = TRUE)
sapply(folds, length)
## Fold01 Fold02 Fold03 Fold04 Fold05 Fold06 Fold07 Fold08 Fold09 Fold10
## 4141 4140 4141 4142 4140 4142 4141 4141 4140 4141
folds[[1]][1:10]
## [1] 1 2 3 4 5 6 7 8 9 10
# Bootstrap resamples: sampling with replacement, each the full data size.
set.seed(32323)
folds <- createResample(y = spam$type, times = 10, list = TRUE)
sapply(folds, length)
## Resample01 Resample02 Resample03 Resample04 Resample05 Resample06
## 4601 4601 4601 4601 4601 4601
## Resample07 Resample08 Resample09 Resample10
## 4601 4601 4601 4601
# Time-series slices: a rolling training window of 20 observations used to
# forecast the next 10.
set.seed(32323)
tme <- 1:1000
folds <- createTimeSlices(y = tme, initialWindow = 20, horizon = 10)
names(folds)
## [1] "train" "test"
folds$train[[1]]
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
folds$test[[1]]
## [1] 21 22 23 24 25 26 27 28 29 30
library(caret); library(kernlab); data(spam)
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
modelFit <- train(type ~ ., data = training, method = "glm")
# Inspect the tuning options exposed by train() and trainControl().
args(train.default)
## function (x, y, method = "rf", preProcess = NULL, ..., weights = NULL,
## metric = ifelse(is.factor(y), "Accuracy", "RMSE"), maximize = ifelse(metric ==
## "RMSE", FALSE, TRUE), trControl = trainControl(), tuneGrid = NULL,
## tuneLength = 3)
## NULL
args(trainControl)
## function (method = "boot", number = ifelse(grepl("cv", method),
## 10, 25), repeats = ifelse(grepl("cv", method), 1, number),
## p = 0.75, initialWindow = NULL, horizon = 1, fixedWindow = TRUE,
## verboseIter = FALSE, returnData = TRUE, returnResamp = "final",
## savePredictions = FALSE, classProbs = FALSE, summaryFunction = defaultSummary,
## selectionFunction = "best", preProcOptions = list(thresh = 0.95,
## ICAcomp = 3, k = 5), index = NULL, indexOut = NULL, timingSamps = 0,
## predictionBounds = rep(FALSE, 2), seeds = NA, adaptive = list(min = 5,
## alpha = 0.05, method = "gls", complete = TRUE), allowParallel = TRUE)
## NULL
# Wage data from the ISLR package (Mid-Atlantic male workers).
library(ISLR); library(ggplot2); library(caret)
data(Wage)
summary(Wage)
## year age sex maritl
## Min. :2003 Min. :18.00 1. Male :3000 1. Never Married: 648
## 1st Qu.:2004 1st Qu.:33.75 2. Female: 0 2. Married :2074
## Median :2006 Median :42.00 3. Widowed : 19
## Mean :2006 Mean :42.41 4. Divorced : 204
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## race education region
## 1. White:2480 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. Black: 293 2. HS Grad :971 1. New England : 0
## 3. Asian: 190 3. Some College :650 3. East North Central: 0
## 4. Other: 37 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins logwage
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083 Min. :3.000
## 2. Information:1456 2. >=Very Good:2142 2. No : 917 1st Qu.:4.447
## Median :4.653
## Mean :4.654
## 3rd Qu.:4.857
## Max. :5.763
##
## wage
## Min. : 20.09
## 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
# 70/30 train/test split on the wage outcome.
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training <- Wage[inTrain, ]
testing <- Wage[-inTrain, ]
dim(training); dim(testing)
## [1] 2102 12
## [1] 898 12
# Pairwise feature plot of candidate predictors against wage.
featurePlot(x = training[, c("age", "education", "jobclass")],
y = training$wage, plot = "pairs")
qplot(age, wage, data = training)
# Color by job class and add a linear smoother per class.
qq <- qplot(age, wage, colour = jobclass, data = training)
qq + geom_smooth(method = "lm", formula = y ~ x)
# Discretize wage into three roughly equal-sized groups (Hmisc::cut2).
cutWage <- cut2(training$wage, g = 3)
table(cutWage)
## cutWage
## [ 20.1, 91.7) [ 91.7,118.9) [118.9,318.3]
## 703 721 678
p1 <- qplot(cutWage, age, data = training, fill = cutWage, geom = c("boxplot"))
p1
# Cross-tabulate wage group against job class.
t1 <- table(cutWage, training$jobclass)
t1
##
## cutWage 1. Industrial 2. Information
## [ 20.1, 91.7) 437 266
## [ 91.7,118.9) 373 348
## [118.9,318.3] 274 404
# Row proportions: share of each wage group in each job class.
prop.table(t1, 1)
##
## cutWage 1. Industrial 2. Information
## [ 20.1, 91.7) 0.6216216 0.3783784
## [ 91.7,118.9) 0.5173370 0.4826630
## [118.9,318.3] 0.4041298 0.5958702
qplot(wage, colour = education, data = training, geom = "density")
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
# The capital-run-length predictor is heavily right-skewed.
hist(training$capitalAve, main = "", xlab = "ave. capital run length")
# Center and scale every predictor using TRAINING-set statistics.
preObj <- preProcess(training[, -58], method = c("center", "scale"))
trainCapAveS <- predict(preObj, training[, -58])$capitalAve
mean(trainCapAveS)
## [1] 4.226973e-18
sd(trainCapAveS)
## [1] 1
# Apply the same training-set transformation to the test set; its mean and
# sd are therefore close to, but not exactly, 0 and 1.
testCapAveS <- predict(preObj, testing[, -58])$capitalAve
mean(testCapAveS)
## [1] -0.04012184
sd(testCapAveS)
## [1] 0.2844746
# Box-Cox transform to make the skewed predictor more Gaussian.
preObj <- preProcess(training[, -58], method = c("BoxCox"))
trainCapAveS <- predict(preObj, training[, -58])$capitalAve
par(mfrow = c(1, 2)); hist(trainCapAveS); qqnorm(trainCapAveS)
set.seed(13343)
# Knock out ~5% of the values at random so we can test imputation.
training$capAve <- training$capitalAve
selectNA <- rbinom(dim(training)[1], size = 1, prob = 0.05)==1
training$capAve[selectNA] <- NA
# Impute the missing values with k-nearest-neighbors and standardize.
preObj <- preProcess(training[, -58], method = "knnImpute")
capAve <- predict(preObj, training[, -58])$capAve
# Standardize the true (pre-knockout) values for comparison.
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth - mean(capAveTruth))/sd(capAveTruth)
# Distribution of the imputation error over ALL observations.
quantile(capAve - capAveTruth)
## 0% 25% 50% 75% 100%
## -1.2316629250 -0.0007259377 0.0003077242 0.0008111265 0.1296147093
# Error at the imputed positions only. NOTE: the original notes wrote
# quantile(capAve - capAveTruth)[selectNA], which indexes the 5-number
# quantile summary by the long logical vector and just prints NAs;
# subset the errors FIRST, then take quantiles (as done for !selectNA below).
quantile((capAve - capAveTruth)[selectNA])
## (five summary quantiles of the error at imputed positions; larger in
## magnitude than at the untouched positions below)
# Error at the positions that were never made NA — essentially zero.
quantile((capAve - capAveTruth)[!selectNA])
## 0% 25% 50% 75% 100%
## -0.7573602868 -0.0006708458 0.0003025593 0.0007748003 0.0011864402
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training <- Wage[inTrain, ]; testing <- Wage[-inTrain, ]
# Convert the factor jobclass into indicator (dummy) variables.
dummies <- dummyVars(wage ~ jobclass, data = training)
head(predict(dummies, newdata = training))
## jobclass.1. Industrial jobclass.2. Information
## 86582 0 1
## 11443 0 1
## 160191 1 0
## 11141 0 1
## 229379 1 0
## 86064 1 0
# Flag near-zero-variance predictors (e.g. sex and region are constant in
# this data set and carry no predictive information).
nsv <- nearZeroVar(training, saveMetrics = TRUE)
print(nsv)
## freqRatio percentUnique zeroVar nzv
## year 1.037356 0.33301618 FALSE FALSE
## age 1.027027 2.85442436 FALSE FALSE
## sex 0.000000 0.04757374 TRUE TRUE
## maritl 3.272931 0.23786870 FALSE FALSE
## race 8.938776 0.19029496 FALSE FALSE
## education 1.389002 0.23786870 FALSE FALSE
## region 0.000000 0.04757374 TRUE TRUE
## jobclass 1.000000 0.09514748 FALSE FALSE
## health 2.468647 0.09514748 FALSE FALSE
## health_ins 2.352472 0.09514748 FALSE FALSE
## logwage 1.061728 19.17221694 FALSE FALSE
## wage 1.061728 19.17221694 FALSE FALSE
library(splines)
# Cubic spline basis (3 df) for age, used to fit a curvilinear wage trend.
bsBasis <- bs(training$age, df = 3)
head(bsBasis)
## 1 2 3
## [1,] 0.2368501 0.02537679 0.000906314
## [2,] 0.3625256 0.38669397 0.137491189
## [3,] 0.4422183 0.19539878 0.028779665
## [4,] 0.4430868 0.24369776 0.044677923
## [5,] 0.4308138 0.29109043 0.065560908
## [6,] 0.4261690 0.14823269 0.017186399
# Fit wage on the spline basis and overlay the fitted curve on the scatter.
lm1 <- lm(wage ~ bsBasis, data = training)
plot(training$age, training$wage, pch = 19, cex = 0.5)
points(training$age, predict(lm1, newdata = training), col = 'red',
pch = 19, cex = 0.5)
# Evaluate the SAME training-set basis at the test-set ages. The argument
# name is 'newx'; the original notes passed 'age = testing$age', which
# predict.bs does not recognize, so 'newx' was missing and the training
# basis was silently returned unchanged (the printed output was identical
# to head(bsBasis) above).
head(predict(bsBasis, newx = testing$age))
## (rows now correspond to the first test-set ages, not the training rows)
# Principal components: find highly correlated predictor pairs in the spam
# data, then summarise correlated variables with PCA.
library(caret); library(kernlab); data(spam)
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
# Absolute pairwise correlations of all predictors (column 58 is the
# outcome); zero the diagonal so self-correlations are not reported.
M <- abs(cor(training[,-58]))
diag(M) <- 0
which(M > 0.8, arr.ind = TRUE)
## row col
## num415 34 32
## direct 40 32
## num857 32 34
## direct 40 34
## num857 32 40
## num415 34 40
# PCA on just the two correlated columns: PC1 captures most of the shared
# variation, PC2 adds little.
smallSpam <- spam[, c(32, 34)]
prComp <- prcomp(smallSpam)
plot(prComp$x[, 1], prComp$x[, 2])
# PCA on all predictors (log10(x + 1) to reduce skewness); points coloured
# by spam (red, value 2) vs ham (black, value 1).
typeColor <- ((spam$type == "spam") * 1 + 1)
prComp <- prcomp(log10(spam[, -58] + 1))
plot(prComp$x[, 1], prComp$x[, 2], col = typeColor, xlab = "PC1",
ylab = "PC2")
# Linear regression: predict Old Faithful eruption duration from the waiting
# time between eruptions, with a 50/50 train/test split.
library(caret); data(faithful); set.seed(333)
inTrain <- createDataPartition(y = faithful$waiting, p = 0.5, list = FALSE)
trainFaith <- faithful[inTrain, ]; testFaith <- faithful[-inTrain, ]
head(trainFaith)
## eruptions waiting
## 6 2.883 55
## 11 1.833 54
## 16 2.167 52
## 19 1.600 52
## 22 1.750 47
## 27 1.967 55
# Simple linear model: eruptions = b0 + b1 * waiting.
lm1 <- lm(eruptions ~ waiting, data = trainFaith)
summary(lm1)
##
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26990 -0.34789 0.03979 0.36589 1.05020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.792739 0.227869 -7.867 1.04e-12 ***
## waiting 0.073901 0.003148 23.474 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared: 0.8032, Adjusted R-squared: 0.8018
## F-statistic: 551 on 1 and 135 DF, p-value: < 2.2e-16
# Scatter plot of the training data plus the fitted regression line.
par(mfrow = c(1, 2))
plot(trainFaith$waiting, trainFaith$eruptions)
plot(trainFaith$waiting, trainFaith$eruptions)
lines(trainFaith$waiting, lm1$fitted)
# Predict eruption duration for a single new waiting time of 80 minutes.
newdata <- data.frame(waiting = 80)
predict(lm1, newdata)
## 1
## 4.119307
# Side by side: fitted line over the training data vs predictions over the
# held-out test data.
par(mfrow = c(1, 2))
plot(trainFaith$waiting, trainFaith$eruptions)
lines(trainFaith$waiting, predict(lm1))
plot(testFaith$waiting, testFaith$eruptions)
lines(testFaith$waiting, predict(lm1, newdata = testFaith))
# RMSE train
# (strictly the root SUM of squared errors -- dividing by n before the sqrt
# would give the true RMSE, but the train/test comparison is still valid)
sqrt(sum((lm1$fitted - trainFaith$eruptions)^2))
## [1] 5.75186
# RMSE test
# (test error is slightly larger than training error, as expected)
sqrt(sum((predict(lm1, newdata = testFaith) - testFaith$eruptions)^2))
## [1] 5.838559
# Prediction intervals on the test set, drawn in waiting-time order so the
# interval lines are monotone rather than zig-zagging.
pred1 <- predict(lm1, newdata = testFaith, interval = "prediction")
ord <- order(testFaith$waiting)
plot(testFaith$waiting, testFaith$eruptions)
matlines(testFaith$waiting[ord], pred1[ord, ], type = "l")
# Predicting wages (ISLR Wage data). Drop logwage -- it is a transform of
# the outcome and would leak the answer into the predictors.
library(ISLR)
data(Wage); wage <- subset(Wage, select = -c(logwage))
# Note the data are all male, all from the Middle Atlantic region (the other
# levels of sex/region are empty), which is why those variables showed up as
# zero-variance above.
summary(wage)
## year age sex maritl
## Min. :2003 Min. :18.00 1. Male :3000 1. Never Married: 648
## 1st Qu.:2004 1st Qu.:33.75 2. Female: 0 2. Married :2074
## Median :2006 Median :42.00 3. Widowed : 19
## Mean :2006 Mean :42.41 4. Divorced : 204
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## race education region
## 1. White:2480 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. Black: 293 2. HS Grad :971 1. New England : 0
## 3. Asian: 190 3. Some College :650 3. East North Central: 0
## 4. Other: 37 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083
## 2. Information:1456 2. >=Very Good:2142 2. No : 917
##
##
##
##
##
## wage
## Min. : 20.09
## 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
# 70/30 train/test split on the wage data prepared above.
inTrain <- createDataPartition(y = wage$wage, p = 0.7, list = FALSE)
training <- wage[inTrain, ]; testing <- wage[-inTrain, ]
dim(training); dim(testing)
## [1] 2102 11
## [1] 898 11
# Exploratory plots: pairwise feature plot, then wage vs age coloured by
# jobclass and by education.
featurePlot(x = training[, c("age", "education", "jobclass")], y = training$wage, plot = "pairs")
qplot(age, wage, colour = jobclass, data = training)
qplot(age, wage, colour = education, data = training)
# Fit a linear model of wage on age, jobclass and education via caret.
# (FIX: stray prose "information variable" from the surrounding notes had
# been fused onto the start of this line in the original transcript, making
# it a syntax error; only the code is kept.)
modFit <- train(wage ~ age + jobclass + education, method = "lm", data = training)
finMod <- modFit$finalModel
print(modFit)
## Linear Regression
##
## 2102 samples
## 10 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ...
##
## Resampling results
##
## RMSE Rsquared RMSE SD Rsquared SD
## 36.34569 0.275526 1.281115 0.02146238
##
##
# Diagnostics on the final model: residuals vs fitted (semi-transparent
# points), residuals coloured by a variable NOT in the model (race) to look
# for unexplained structure, and residuals by row index (a trend here would
# suggest a missing time/order variable).
plot(finMod, 1, pch = 19, cex = 0.5, col = "#00000010")
qplot(finMod$fitted, finMod$residuals, colour = race, data = training)
plot(finMod$residuals)
# Predicted vs actual wage on the held-out test set, coloured by year.
pred <- predict(modFit, testing)
qplot(wage, pred, colour = year, data = testing)
# Classification tree (CART) on the iris data via caret's "rpart" method.
data(iris)
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
inTrain <- createDataPartition(y = iris$Species, p = 0.7, list = FALSE)
training <- iris[inTrain, ]; testing <- iris[-inTrain, ]
dim(training); dim(testing)
## [1] 105 5
## [1] 45 5
qplot(Petal.Width, Sepal.Width, colour = Species, data = training)
modFit <- train(Species ~ ., method = "rpart", data = training)
## Loading required package: rpart
# Printed tree: each node shows the split rule, node size, number
# misclassified, predicted class and class probabilities; * marks a leaf.
print(modFit$finalModel)
## n= 105
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 105 70 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 35 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 70 35 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 36 2 versicolor (0.00000000 0.94444444 0.05555556) *
## 7) Petal.Width>=1.75 34 1 virginica (0.00000000 0.02941176 0.97058824) *
# Base-graphics dendrogram, then a prettier rendering from rattle.
plot(modFit$finalModel, uniform = TRUE)
text(modFit$finalModel, use.n = TRUE, all = TRUE)
library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 3.3.0 Copyright (c) 2006-2014 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
fancyRpartPlot(modFit$finalModel)
# Class predictions on the held-out test set.
predict(modFit, newdata = testing)
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa versicolor versicolor versicolor
## [19] versicolor versicolor versicolor versicolor versicolor versicolor
## [25] versicolor versicolor versicolor versicolor versicolor versicolor
## [31] virginica versicolor virginica virginica versicolor virginica
## [37] virginica virginica virginica virginica versicolor virginica
## [43] virginica virginica virginica
## Levels: setosa versicolor virginica
# Bagging by hand: repeatedly bootstrap-resample the ozone data, fit a loess
# curve to each resample, and average the curves to reduce variance.
library(ElemStatLearn); data(ozone, package = "ElemStatLearn")
##
## Attaching package: 'ElemStatLearn'
##
## The following object is masked _by_ '.GlobalEnv':
##
## spam
ozone <- ozone[order(ozone$ozone), ]
head(ozone)
## ozone radiation temperature wind
## 17 1 8 59 9.7
## 19 4 25 61 9.7
## 14 6 78 57 18.4
## 45 7 48 80 14.3
## 106 7 49 69 10.3
## 7 8 19 61 20.1
# 10 bootstrap resamples; each row of ll holds one loess curve (span 0.2)
# evaluated on the common grid ozone = 1:155.
ll <- matrix(NA, nrow = 10, ncol = 155)
for(i in 1:10){
ss <- sample(1:dim(ozone)[1], replace = T)
ozone0 <- ozone[ss, ]; ozone0 <- ozone0[order(ozone0$ozone), ]
loess0 <- loess(temperature ~ ozone, data = ozone0, span = 0.2)
ll[i, ] <- predict(loess0, newdata = data.frame(ozone = 1:155))
}
# Grey: the individual resampled fits (high variance); red: the bagged
# (column-wise mean) curve, which is smoother and more stable.
plot(ozone$ozone, ozone$temperature, pch = 19, cex = 0.5)
for(i in 1:10){lines(1:155, ll[i, ], col = "grey", lwd = 2)}
lines(1:155, apply(ll, 2, mean), col = "red", lwd = 2)
# Random forest on iris; prox = TRUE stores the proximity matrix, used below
# by classCenter() to locate class "centres".
inTrain <- createDataPartition(y = iris$Species, p = 0.7, list = FALSE)
training <- iris[inTrain, ]; testing <- iris[-inTrain, ]
modFit <- train(Species ~ ., data = training, method = "rf", prox = TRUE)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:Hmisc':
##
## combine
modFit
## Random Forest
##
## 105 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 105, 105, 105, 105, 105, 105, ...
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.9574476 0.9354405 0.03002109 0.04553321
## 3 0.9531950 0.9290007 0.03262020 0.04953601
## 4 0.9531034 0.9288856 0.03242096 0.04902118
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Class centres in the Petal.Length/Petal.Width plane (columns 3 and 4),
# overlaid as large X's on the training points.
irisP <- classCenter(training[, c(3, 4)], training$Species, modFit$finalModel$prox)
irisP <- as.data.frame(irisP); irisP$Species <- rownames(irisP)
p <- qplot(Petal.Width, Petal.Length, col = Species, data = training)
p + geom_point(aes(x = Petal.Width, y = Petal.Length, col = Species), size = 5, shape = 4, data = irisP)
# Test-set performance: confusion table, then a plot highlighting which test
# points were predicted correctly (misses sit near the class boundary).
pred <- predict(modFit, testing)
testing$predRight <- pred == testing$Species
table(pred, testing$Species)
##
## pred setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 14 1
## virginica 0 1 14
qplot(Petal.Width, Petal.Length, colour = predRight, data = testing, main = "newdata Prediction")
# Boosting with trees (gbm) on the Wage data; drop logwage (a transform of
# the outcome) and split 70/30.
Wage <- subset(Wage, select = -c(logwage))
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training <- Wage[inTrain, ]; testing <- Wage[-inTrain, ]
# verbose = FALSE suppresses gbm's long per-iteration log.
modFit <- train(wage ~ ., method = "gbm", data = training, verbose = FALSE)
## Loading required package: gbm
## Loading required package: parallel
## Loaded gbm 2.1
## Loading required package: plyr
##
## Attaching package: 'plyr'
##
## The following object is masked _by_ '.GlobalEnv':
##
## ozone
##
## The following object is masked from 'package:ElemStatLearn':
##
## ozone
##
## The following objects are masked from 'package:Hmisc':
##
## is.discrete, summarize
# caret tuned n.trees and interaction.depth over a grid via bootstrap
# resampling; shrinkage was held at 0.1.
print(modFit)
## Stochastic Gradient Boosting
##
## 2102 samples
## 10 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ...
##
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared RMSE SD Rsquared SD
## 1 50 34.88119 0.3354209 1.580301 0.02651963
## 1 100 34.24918 0.3460333 1.559129 0.02628697
## 1 150 34.13728 0.3484395 1.523833 0.02698527
## 2 50 34.14742 0.3514951 1.562077 0.02677109
## 2 100 34.01654 0.3533109 1.511470 0.02645130
## 2 150 34.05669 0.3517570 1.517989 0.02755649
## 3 50 34.01561 0.3541818 1.573323 0.02701083
## 3 100 34.12517 0.3492055 1.511990 0.02609524
## 3 150 34.30884 0.3433877 1.506589 0.02553441
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 50, interaction.depth
## = 3 and shrinkage = 0.1.
# Predicted vs actual wage on the test set.
qplot(predict(modFit, testing), wage, data = testing)
# Model-based prediction on iris: linear discriminant analysis, plus a naive
# Bayes fit (method = "nb", needs the klaR package) left commented out --
# presumably because that dependency was unavailable when the notes were
# rendered.
inTrain <- createDataPartition(y = iris$Species, p = 0.7, list = FALSE)
training <- iris[inTrain, ]
testing <- iris[-inTrain, ]
dim(training); dim(testing)
## [1] 105 5
## [1] 45 5
modlda <- train(Species ~ ., data = training, method = "lda")
# modnb <- train(Species ~ ., data = training, method = "nb")
plda = predict(modlda, testing)
# The commented lines would compare LDA and naive Bayes predictions and plot
# where the two models disagree.
# pnb = predict(modnb, testing)
# table(plda, pnb)
# equalPredictions = (plda == pnb)
# qplot(Petal.Width, Sepal.Width, colour = equalPredictions, data = testing)
# Combining predictors (model stacking) on the Wage data: fit two different
# models, then fit a third model on their predictions.
data(Wage); wage <- subset(Wage, select = -c(logwage))
# Split into a build set (70%) and a validation set (30%), then split the
# build set again into training and testing sets.
inBuild <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
validation <- Wage[-inBuild, ]
buildData <- Wage[inBuild, ]
inTrain <- createDataPartition(y = buildData$wage, p = 0.7, list = FALSE)
training <- buildData[inTrain, ]
# FIX: this was `tresting <- ...` (typo); every subsequent line references
# `testing`, so the original would have failed with "object not found".
testing <- buildData[-inTrain, ]
# Two base learners on the same training data: a GLM and a random forest
# with 3-fold cross-validation.
# FIX: the fold count belongs inside trainControl() as `number = 3`; the
# original passed a misspelled `numbers = 3` to train(), where it was
# silently ignored and the default number of folds was used.
mod1 <- train(wage ~ ., method = "glm", data = training)
mod2 <- train(wage ~ ., method = "rf", data = training, trControl = trainControl(method = "cv", number = 3))
pred1 <- predict(mod1, testing)
pred2 <- predict(mod2, testing)
qplot(pred1, pred2, colour = wage, data = testing)
# The two models do not completely agree
# We can try fitting a model that combines predictors
predDF <- data.frame(pred1, pred2, wage = testing$wage)
combModFit <- train(wage ~ ., method = "gam", data = predDF)
combPred <- predict(combModFit, predDF)
# check errors (root sum of squared errors on the testing set; the combined
# model is expected to have the smallest)
sqrt(sum((pred1 - testing$wage)^2))
sqrt(sum((pred2 - testing$wage)^2))
sqrt(sum((combPred - testing$wage)^2))
# predict on validation set -- column names of predVDF must match the names
# used when combModFit was trained (pred1, pred2)
pred1V <- predict(mod1, validation)
pred2V <- predict(mod2, validation)
predVDF <- data.frame(pred1 = pred1V, pred2 = pred2V)
combPredV <- predict(combModFit, predVDF)
sqrt(sum((pred1V - validation$wage)^2))
sqrt(sum((pred2V - validation$wage)^2))
sqrt(sum((combPredV - validation$wage)^2))
# Forecasting: pull daily GOOG price data with quantmod for 2008-2013.
library(quantmod)
from.dat <- as.Date("01/01/08", format = "%m/%d/%y")
to.dat <- as.Date("12/31/13", format = "%m/%d/%y")
# NOTE(review): the "google" data source has since been discontinued in
# quantmod; src = "yahoo" may be required for this call to work today --
# confirm before re-running.
getSymbols("GOOG", src = "google", from = from.dat, to = to.dat)
head(GOOG)
# summarize monthly and store as time series
mGoog <- to.monthly(GOOG)
# Op() extracts the opening prices; frequency = 12 marks 12 observations
# (months) per period (year) for the ts object.
googOpen <- Op(mGoog)
ts1 <- ts(googOpen, frequency = 12)
plot(ts1, xlab = "Years +1", ylab = "GOOG")
# Unsupervised prediction: cluster the training data ignoring the labels,
# then build a classifier that predicts the cluster assignments.
data(iris)
inTrain <- createDataPartition(y = iris$Species, p = 0.7, list = FALSE)
training <- iris[inTrain, ]
testing <- iris[-inTrain, ]
# k-means with 3 centres on the four measurements (Species column excluded);
# attach the resulting cluster labels to the training frame.
kMeans1 <- kmeans(subset(training, select=-c(Species)), centers = 3)
training$clusters <- as.factor(kMeans1$cluster)
qplot(Petal.Width, Petal.Length, colour = clusters, data = training)
# How well do the unsupervised clusters line up with the true species?
# (setosa is split across clusters 1 and 3 in this run.)
table(kMeans1$cluster, training$Species)
##
## setosa versicolor virginica
## 1 17 0 0
## 2 0 33 35
## 3 18 2 0
# Fit a tree that predicts cluster membership from the measurements
# (Species dropped so the tree cannot see the true labels), then compare its
# predictions against the true species on the training set...
modFit <- train(clusters ~ ., data = subset(training, select=-c(Species)), method = "rpart")
table(predict(modFit, training), training$Species)
##
## setosa versicolor virginica
## 1 21 1 0
## 2 0 33 35
## 3 14 1 0
# ...and on the held-out test set.
testClusterPred <- predict(modFit, testing)
table(testClusterPred, testing$Species)
##
## testClusterPred setosa versicolor virginica
## 1 7 1 0
## 2 0 14 15
## 3 8 0 0